knitr::opts_chunk$set(
echo = TRUE,
include = TRUE,
message = FALSE,
warning = FALSE,
fig.width = 12,
fig.asp = .6,
fig.align = "center",
out.width = "90%"
)
library(tidyverse)
library(dplyr)
library(arsenal)
library(HH)
library(leaps)
library(corrplot)
library(faraway)
library(ggpubr)
library(broom)
library(ggplot2)
library(MASS)
library(patchwork)
hate_crime = read_csv("data/HateCrimes.csv", col_types = "fffdddddd") %>%
janitor::clean_names() %>%
drop_na()
| Overall (N=45) | |
|---|---|
| Level of unemployment | |
| - high | 23 (51.1%) |
| - low | 22 (48.9%) |
| Level of state urbanization | |
| - low | 21 (46.7%) |
| - high | 24 (53.3%) |
| Median Household Income | |
| - Mean (SD) | 55299.49 (8979.49) |
| - Median (Q1, Q3) | 54916.00 (48060.00, 60708.00) |
| - Min - Max | 39552.00 - 76165.00 |
| Percent of adults with a high school degree | |
| - Mean (SD) | 0.87 (0.03) |
| - Median (Q1, Q3) | 0.87 (0.84, 0.89) |
| - Min - Max | 0.80 - 0.92 |
| Percent of population that are not US citizens | |
| - Mean (SD) | 0.06 (0.03) |
| - Median (Q1, Q3) | 0.05 (0.03, 0.08) |
| - Min - Max | 0.01 - 0.13 |
| Income inequality index | |
| - Mean (SD) | 0.46 (0.02) |
| - Median (Q1, Q3) | 0.46 (0.44, 0.47) |
| - Min - Max | 0.42 - 0.53 |
| Percent of population that are non-white | |
| - Mean (SD) | 0.32 (0.15) |
| - Median (Q1, Q3) | 0.30 (0.21, 0.42) |
| - Min - Max | 0.06 - 0.63 |
| Hate crime rate per 100,000 population | |
| - Mean (SD) | 0.30 (0.25) |
| - Median (Q1, Q3) | 0.23 (0.14, 0.35) |
| - Min - Max | 0.07 - 1.52 |
From the histogram below, we observe our outcome distribution has right skewness, suggesting that we may need to check our normality assumption. Our QQ Plot also indicates severe departures from normality.
#Histogram of Outcome Distribution
hate_crime %>%
ggplot(aes(x = hate_crimes_per_100k_splc)) +
geom_histogram(color = "red", fill = "black") +
labs(
title = "Distribution of Hate Crime Rates in 50 US States",
x = "Hate Crime Rate per 100,000 Population",
y = "Frequency of Distribution",
caption = "Distribution of Hate Crime Rates ( 50 US States)")
#QQplot of Outcome Distribution
hate_crimes_per_100k_splc = hate_crime$hate_crimes_per_100k_splc
qqnorm(hate_crimes_per_100k_splc, col = 2, pch = 19, cex = 1.5)
qq_plot = qqline(hate_crimes_per_100k_splc, col = 1,lwd = 2,lty = 2)
After performing a Shapiro-Wilk test to check the normality assumption of our outcome distribution, we find evidence to suggest that our data deviates from normality.
# Perform Shapiro-Wilk test
shapiro.test(hate_crimes_per_100k_splc) %>%
broom::tidy() %>%
knitr::kable("simple")
| statistic | p.value | method |
|---|---|---|
| 0.7107896 | 0 | Shapiro-Wilk normality test |
We apply a square root transformation and a natural log transformation to our outcome distribution, and compare the results of the data.
sqrt_transformation = hate_crime %>%
ggplot(aes(x = sqrt(hate_crimes_per_100k_splc))) +
geom_histogram(color = "red", fill = "black") +
labs(
title = "Distribution of sqrt(Hate Crime Rates) in 50 US States",
x = "sqrt(Hate Crime Rate per 100,000 Population)",
y = "Frequency of Distribution",
caption = "Distribution of Hate Crime Rates ( 50 US States)")
sqrt_qqplot = ggplot(hate_crime, aes(sample = sqrt(hate_crimes_per_100k_splc))) +
stat_qq() + stat_qq_line() +
labs(
title = "QQ Plot of sqrt(Hate Crime Rates) in 50 US States",
x = "sqrt(Hate Crime Rate per 100,000 Population)",
y = "Frequency of Distribution",
caption = "Distribution of Hate Crime Rates ( 50 US States)")
ln_transformation = hate_crime %>%
ggplot(aes(x = log(hate_crimes_per_100k_splc))) +
geom_histogram(color = "red", fill = "black") +
labs(
title = "Distribution of ln(Hate Crime Rates) in 50 US States",
x = "ln(Hate Crime Rate per 100,000 Population)",
y = "Frequency of Distribution",
caption = "Distribution of Hate Crime Rates ( 50 US States)")
ln_qqplot = ggplot(hate_crime, aes(sample = log(hate_crimes_per_100k_splc))) + stat_qq() + stat_qq_line() +
labs(
title = "QQ Plot of ln(Hate Crime Rates) in 50 US States",
x = "ln(Hate Crime Rate per 100,000 Population)",
y = "Frequency of Distribution",
caption = "Distribution of Hate Crime Rates ( 50 US States)")
After visual inspection, we observe that our natural log transformation may be a good candidate to re-test our normality assumptions.
(sqrt_transformation + ln_transformation) / ( sqrt_qqplot + ln_qqplot)
From the results of our test, we observe that we fail to reject the null (our p-value > 0.05) and can state with 95% confidence that our natural log transformation does not significantly deviate from normality, so we can assume normality henceforth.
shapiro.test(log(hate_crimes_per_100k_splc)) %>%
broom::tidy() %>%
knitr::kable("simple", caption = "Shapiro Wilk Test")
| statistic | p.value | method |
|---|---|---|
| 0.9830847 | 0.7452961 | Shapiro-Wilk normality test |
hate_crime = hate_crime %>%
mutate(
ln_hate_crimes_per_100k_splc = log(hate_crimes_per_100k_splc)
)
hate_crime %>%
ggplot(aes(x = ln_hate_crimes_per_100k_splc, y = state, colors = state)) +
geom_col(color = "blue") +
labs(
title = "Outlier Analysis of 50 US States",
x = "ln(Hate Crime Rate per 100,000 Population)",
y = "Frequency of Distribution",
caption = "Distribution of Hate Crime Rates (50 US States)"
)
Upon Plotting a column graph of the hate crimes against their respective states, we can see that Wyoming, South Dakota, and North Dakota had no values and District of Columbia, Washington, Oregon, Minnesota, Massachusetts and Maine showed relatively large columns.
After Plotting a scatter plot of the same values, it was evident that these states were outliers that influenced the data set.
hate_crime %>%
ggplot(aes(y = hate_crimes_per_100k_splc, x = state, colors = state)) +
geom_point(aes(color = state)) +
geom_smooth(method = "lm", se = F, color = "red") +
theme(axis.text.x = element_text(angle = 90),
legend.position = "none")
We verify if the association between income inequality (median household income in this case), holds true, as well as explore associations of all the other covariates mentioned above and draw your own conclusions about each predictor’s significance.
hate_crime %>%
dplyr::select(-state,-unemployment,-urbanization) %>%
cor() %>%
knitr::kable(digits = 2)
| median_household_income | perc_population_with_high_school_degree | perc_non_citizen | gini_index | perc_non_white | hate_crimes_per_100k_splc | ln_hate_crimes_per_100k_splc | |
|---|---|---|---|---|---|---|---|
| median_household_income | 1.00 | 0.65 | 0.30 | -0.13 | 0.04 | 0.34 | 0.31 |
| perc_population_with_high_school_degree | 0.65 | 1.00 | -0.26 | -0.54 | -0.50 | 0.26 | 0.30 |
| perc_non_citizen | 0.30 | -0.26 | 1.00 | 0.48 | 0.75 | 0.24 | 0.14 |
| gini_index | -0.13 | -0.54 | 0.48 | 1.00 | 0.55 | 0.38 | 0.22 |
| perc_non_white | 0.04 | -0.50 | 0.75 | 0.55 | 1.00 | 0.11 | -0.01 |
| hate_crimes_per_100k_splc | 0.34 | 0.26 | 0.24 | 0.38 | 0.11 | 1.00 | 0.89 |
| ln_hate_crimes_per_100k_splc | 0.31 | 0.30 | 0.14 | 0.22 | -0.01 | 0.89 | 1.00 |
hate_crime %>%
dplyr::select(-state,-unemployment,-urbanization) %>% #removing factor variables
cor() %>%
corrplot::corrplot(method = "circle", type = "upper", diag = FALSE)
a = ggscatter(hate_crime, x = "median_household_income", y = "hate_crimes_per_100k_splc",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "Median Household Income", ylab = "Hate Crime Rate (per 100k pop.)")
b = ggscatter(hate_crime, x = "perc_population_with_high_school_degree", y = "hate_crimes_per_100k_splc",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "% of People 25+ with High School Degree", ylab = "Hate Crime Rate (per 100k pop.)")
c = ggscatter(hate_crime, x = "perc_non_citizen", y = "hate_crimes_per_100k_splc",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "% of People Non-US Citizens", ylab = "Hate Crime Rate (per 100k pop.)")
d = ggscatter(hate_crime, x = "gini_index", y = "hate_crimes_per_100k_splc",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "Income Inequality Index (0-100)", ylab = "Hate Crime Rate (per 100k pop.)")
e = ggscatter(hate_crime, x = "perc_non_white", y = "hate_crimes_per_100k_splc",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "% of People Non-White", ylab = "Hate Crime Rate (per 100k pop.)")
From our results, we observe that predictors, “gini_index” and “median_household_income” have the highest correlations to our outcome of interest.
(a + b + c) / (d + e)
# Scatter plot showing associations between numeric variables
hate_crime %>%
dplyr::select(-state,-unemployment,-urbanization) %>%
pairs()
# fitting MLR model on tidy data without state variable
mult_fit <-
lm(
ln_hate_crimes_per_100k_splc ~ unemployment + urbanization + median_household_income + perc_population_with_high_school_degree + perc_non_citizen + gini_index + perc_non_white,
data = hate_crime
)
vif(mult_fit) %>% knitr::kable("simple")
| x | |
|---|---|
| unemploymentlow | 1.426492 |
| urbanizationhigh | 1.983246 |
| median_household_income | 3.108161 |
| perc_population_with_high_school_degree | 3.895361 |
| perc_non_citizen | 3.728286 |
| gini_index | 1.845436 |
| perc_non_white | 3.236419 |
All the predictors have a VIF below 5. This suggests that it would not be problematic to include them in the construction of the model. However, the correlation analysis shows that variables perc_non_white and perc_non_citizen have a moderate linear relationship with a correlation coefficient of 0.75.
ggplot(hate_crimedft,
aes(
x = gini_index,
y = ln_hate_crimes_per_100k_splc,
colour = factor(unemployment)
)) +
geom_point(size = 2) +
geom_smooth(method = "lm",
se = F,
aes(
group = factor(unemployment),
color = factor(unemployment)
)) +
labs(title = "ln(Hate crime per 100k) people vs. Unemployment Status",
x = "ln(gini index)", y = "ln(hate crime per 100k people)") +
scale_color_manual(
name = "Unemployment",
labels = c("Low", "High"),
values = c("blue", "red")
)
reg1t <-
lm(ln_hate_crimes_per_100k_splc ~ gini_index * factor(unemployment),
data = hate_crimedft)
summary(reg1t)
##
## Call:
## lm(formula = ln_hate_crimes_per_100k_splc ~ gini_index * factor(unemployment),
## data = hate_crimedft)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.08066 -0.39519 -0.00407 0.30086 1.51569
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.0684 2.6960 0.767 0.447
## gini_index 2.9069 2.3159 1.255 0.217
## factor(unemployment)1 0.5293 3.6641 0.144 0.886
## gini_index:factor(unemployment)1 0.8122 3.2178 0.252 0.802
##
## Residual standard error: 0.6305 on 41 degrees of freedom
## Multiple R-squared: 0.1214, Adjusted R-squared: 0.05711
## F-statistic: 1.888 on 3 and 41 DF, p-value: 0.1466
There is no significant interaction at 5% significance level. The relationship between hate crime per 100k people and income equality does not vary by unemployment status.
#Scatter plot - Hate_crime_per_100k_splc vs. gini index by urbanization
ggplot(hate_crimedft, aes(x =gini_index, y = ln_hate_crimes_per_100k_splc, colour = factor(urbanization))) +
geom_point(size = 2) +
geom_smooth(method = "lm", se = F,
aes(group = factor(urbanization),
color = factor(urbanization))) +
labs(title = "ln(Hate crime per 100k people) vs. income equality by urbanization status",
x = "ln(gini index)", y = "ln(hate crime per 100k people)") +
scale_color_manual(name = "Urbanization", labels = c("Low", "High"), values = c("blue", "red"))
reg2t <- lm(ln_hate_crimes_per_100k_splc ~ gini_index*factor(urbanization), data = hate_crimedft)
summary(reg2t)
##
## Call:
## lm(formula = ln_hate_crimes_per_100k_splc ~ gini_index * factor(urbanization),
## data = hate_crimedft)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.23999 -0.42661 -0.03661 0.42869 1.25787
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.019 3.464 -0.872 0.388
## gini_index -1.263 2.970 -0.425 0.673
## factor(urbanization)1 4.834 4.113 1.175 0.247
## gini_index:factor(urbanization)1 4.081 3.579 1.140 0.261
##
## Residual standard error: 0.6443 on 41 degrees of freedom
## Multiple R-squared: 0.08241, Adjusted R-squared: 0.01527
## F-statistic: 1.227 on 3 and 41 DF, p-value: 0.312
There is no significant interaction at 5% significance level.The relationship between hate crime per 100k people and income equality does not vary by urbanization status.
ggplot(
hate_crimedft,
aes(
x = perc_population_with_high_school_degree,
y = ln_hate_crimes_per_100k_splc,
colour = factor(unemployment)
)
) +
geom_point(size = 2) +
geom_smooth(method = "lm",
se = F,
aes(
group = factor(unemployment),
color = factor(unemployment)
)) +
labs(title = "Scatterplot of Hate crime per 100k people vs. income equality by unemploymnet status",
x = "ln(percentage of population with high school degree and higher)", y = "ln(hate crime per 100k people)") +
scale_color_manual(
name = "Unemployment",
labels = c("Low", "High"),
values = c("blue", "red")
)
reg11t <-
lm(
ln_hate_crimes_per_100k_splc ~ perc_population_with_high_school_degree * factor(unemployment),
data = hate_crimedft
)
summary(reg11t)
##
## Call:
## lm(formula = ln_hate_crimes_per_100k_splc ~ perc_population_with_high_school_degree *
## factor(unemployment), data = hate_crimedft)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2361 -0.3816 0.1036 0.3528 1.7602
##
## Coefficients:
## Estimate
## (Intercept) -1.1389
## perc_population_with_high_school_degree 0.9443
## factor(unemployment)1 0.9631
## perc_population_with_high_school_degree:factor(unemployment)1 4.8987
## Std. Error
## (Intercept) 0.5088
## perc_population_with_high_school_degree 2.6854
## factor(unemployment)1 0.8374
## perc_population_with_high_school_degree:factor(unemployment)1 3.8931
## t value Pr(>|t|)
## (Intercept) -2.238 0.0307
## perc_population_with_high_school_degree 0.352 0.7269
## factor(unemployment)1 1.150 0.2567
## perc_population_with_high_school_degree:factor(unemployment)1 1.258 0.2154
##
## (Intercept) *
## perc_population_with_high_school_degree
## factor(unemployment)1
## perc_population_with_high_school_degree:factor(unemployment)1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.63 on 41 degrees of freedom
## Multiple R-squared: 0.1228, Adjusted R-squared: 0.05865
## F-statistic: 1.914 on 3 and 41 DF, p-value: 0.1424
There is no significant interaction at 5% significance level. The relationship between hate crime per 100k people and education level does not vary by unemployment status.
#Scatter plot - Hate_crime_per_100k_splc vs. education level by urbanization
ggplot(hate_crimedft, aes(x = perc_population_with_high_school_degree, y = ln_hate_crimes_per_100k_splc, colour = factor(urbanization))) +
geom_point(size = 2) +
geom_smooth(method = "lm", se = F,
aes(group = factor(urbanization),
color = factor(urbanization))) +
labs(title = "Scatterplot of Hate crime per 100k people vs. education level by urbanization status",
x = "ln(percentage of population with high school degree or higher)", y = "ln(hate crime per 100k people)") +
scale_color_manual(name = "Urbanization", labels = c("Low", "High"), values = c("blue", "red"))
reg22t <- lm(ln_hate_crimes_per_100k_splc ~ perc_population_with_high_school_degree*factor(urbanization), data = hate_crimedft)
summary(reg22t)
##
## Call:
## lm(formula = ln_hate_crimes_per_100k_splc ~ perc_population_with_high_school_degree *
## factor(urbanization), data = hate_crimedft)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.37000 -0.40173 0.02948 0.45744 1.62894
##
## Coefficients:
## Estimate
## (Intercept) -1.1362
## perc_population_with_high_school_degree 2.0724
## factor(urbanization)1 1.1960
## perc_population_with_high_school_degree:factor(urbanization)1 4.2938
## Std. Error
## (Intercept) 0.4708
## perc_population_with_high_school_degree 2.2729
## factor(urbanization)1 0.7401
## perc_population_with_high_school_degree:factor(urbanization)1 3.4439
## t value Pr(>|t|)
## (Intercept) -2.413 0.0204
## perc_population_with_high_school_degree 0.912 0.3672
## factor(urbanization)1 1.616 0.1138
## perc_population_with_high_school_degree:factor(urbanization)1 1.247 0.2196
##
## (Intercept) *
## perc_population_with_high_school_degree
## factor(urbanization)1
## perc_population_with_high_school_degree:factor(urbanization)1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6118 on 41 degrees of freedom
## Multiple R-squared: 0.1727, Adjusted R-squared: 0.1122
## F-statistic: 2.853 on 3 and 41 DF, p-value: 0.04888
There is no significant interaction at 5% significance level.The relationship between hate crime per 100k people and education level does not vary by urbanization status.
Check model assumptions and goodness of fit (Vihar Desu)
hate_crime =
hate_crime %>%
drop_na() %>%
dplyr::select(-state)
hate_crime_fit = lm(hate_crimes_per_100k_splc ~., data = hate_crime)
summary(hate_crime_fit)
##
## Call:
## lm(formula = hate_crimes_per_100k_splc ~ ., data = hate_crime)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.21283 -0.04959 0.00530 0.03604 0.32897
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.644e+00 1.061e+00 -2.492 0.01744
## unemploymentlow -5.250e-02 3.558e-02 -1.476 0.14872
## urbanizationhigh -6.284e-02 4.152e-02 -1.514 0.13887
## median_household_income -7.984e-08 2.917e-06 -0.027 0.97831
## perc_population_with_high_school_degree 2.008e+00 9.488e-01 2.117 0.04128
## perc_non_citizen 8.814e-01 9.181e-01 0.960 0.34344
## gini_index 3.596e+00 1.069e+00 3.363 0.00184
## perc_non_white 3.125e-02 1.796e-01 0.174 0.86286
## ln_hate_crimes_per_100k_splc 3.010e-01 2.761e-02 10.901 5.89e-13
##
## (Intercept) *
## unemploymentlow
## urbanizationhigh
## median_household_income
## perc_population_with_high_school_degree *
## perc_non_citizen
## gini_index **
## perc_non_white
## ln_hate_crimes_per_100k_splc ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09845 on 36 degrees of freedom
## Multiple R-squared: 0.8747, Adjusted R-squared: 0.8468
## F-statistic: 31.41 on 8 and 36 DF, p-value: 5.285e-14
Box-Cox without transformation
hate_crime_fit %>%
MASS::boxcox()
Take log transformation
hate_crime_log_fit = lm(log(hate_crimes_per_100k_splc) ~ . ,
data = hate_crime)
par(mfrow = c(1, 2))
# without transformation
qqnorm(resid(hate_crime_fit), xlab = "Expected Value", ylab = "Residual", main = "")
qqline(resid(hate_crime_fit))
title("QQ Plot for Hate crime rate")
# with transformation
qqnorm(resid(hate_crime_log_fit), xlab = "Expected Value", ylab = "Residual", main = "")
qqline(resid(hate_crime_log_fit))
title("QQ Plot for Ln(Hate crime rate)")
Log transformation is better.
Stepwise selection
step(hate_crime_fit, direction = "backward")
## Start: AIC=-200.68
## hate_crimes_per_100k_splc ~ unemployment + urbanization + median_household_income +
## perc_population_with_high_school_degree + perc_non_citizen +
## gini_index + perc_non_white + ln_hate_crimes_per_100k_splc
##
## Df Sum of Sq RSS AIC
## - median_household_income 1 0.00001 0.34893 -202.68
## - perc_non_white 1 0.00029 0.34922 -202.64
## - perc_non_citizen 1 0.00893 0.35786 -201.54
## <none> 0.34893 -200.68
## - unemployment 1 0.02111 0.37003 -200.04
## - urbanization 1 0.02220 0.37113 -199.90
## - perc_population_with_high_school_degree 1 0.04342 0.39235 -197.40
## - gini_index 1 0.10963 0.45856 -190.38
## - ln_hate_crimes_per_100k_splc 1 1.15185 1.50078 -137.03
##
## Step: AIC=-202.68
## hate_crimes_per_100k_splc ~ unemployment + urbanization + perc_population_with_high_school_degree +
## perc_non_citizen + gini_index + perc_non_white + ln_hate_crimes_per_100k_splc
##
## Df Sum of Sq RSS AIC
## - perc_non_white 1 0.00029 0.34922 -204.64
## - perc_non_citizen 1 0.00973 0.35866 -203.44
## <none> 0.34893 -202.68
## - unemployment 1 0.02144 0.37038 -202.00
## - urbanization 1 0.02248 0.37141 -201.87
## - perc_population_with_high_school_degree 1 0.08754 0.43648 -194.61
## - gini_index 1 0.11054 0.45948 -192.29
## - ln_hate_crimes_per_100k_splc 1 1.15443 1.50336 -138.95
##
## Step: AIC=-204.64
## hate_crimes_per_100k_splc ~ unemployment + urbanization + perc_population_with_high_school_degree +
## perc_non_citizen + gini_index + ln_hate_crimes_per_100k_splc
##
## Df Sum of Sq RSS AIC
## <none> 0.34922 -204.64
## - perc_non_citizen 1 0.01971 0.36893 -204.17
## - urbanization 1 0.02273 0.37195 -203.80
## - unemployment 1 0.02364 0.37286 -203.69
## - perc_population_with_high_school_degree 1 0.09136 0.44058 -196.19
## - gini_index 1 0.11260 0.46182 -194.07
## - ln_hate_crimes_per_100k_splc 1 1.15426 1.50348 -140.95
##
## Call:
## lm(formula = hate_crimes_per_100k_splc ~ unemployment + urbanization +
## perc_population_with_high_school_degree + perc_non_citizen +
## gini_index + ln_hate_crimes_per_100k_splc, data = hate_crime)
##
## Coefficients:
## (Intercept)
## -2.60635
## unemploymentlow
## -0.05393
## urbanizationhigh
## -0.06325
## perc_population_with_high_school_degree
## 1.95945
## perc_non_citizen
## 0.96767
## gini_index
## 3.61009
## ln_hate_crimes_per_100k_splc
## 0.30090
#lm(hate_crimes_per_100k_splc ~ perc_population_with_high_school_degree + gini_index, data = hate_crime)
stepwise_log_fit =
lm(log(hate_crimes_per_100k_splc) ~ perc_population_with_high_school_degree + gini_index, data = hate_crime)
par(mfrow = c(2, 2))
plot(stepwise_log_fit)
hate_crime_no_dc =
hate_crime[c(-9),]
Fit the model without DC
hate_crime_fit_no_dc = lm(log(hate_crimes_per_100k_splc) ~ . , data = hate_crime_no_dc)
step(hate_crime_fit_no_dc, direction = "backward")
## Start: AIC=-3232.37
## log(hate_crimes_per_100k_splc) ~ unemployment + urbanization +
## median_household_income + perc_population_with_high_school_degree +
## perc_non_citizen + gini_index + perc_non_white + ln_hate_crimes_per_100k_splc
##
## Df Sum of Sq RSS AIC
## - perc_non_white 1 0.000 0.000 -3365.4
## <none> 0.000 -3232.4
## - perc_non_citizen 1 0.000 0.000 -3232.2
## - perc_population_with_high_school_degree 1 0.000 0.000 -3222.0
## - urbanization 1 0.000 0.000 -3220.0
## - unemployment 1 0.000 0.000 -3209.8
## - median_household_income 1 0.000 0.000 -3193.4
## - gini_index 1 0.000 0.000 -3191.0
## - ln_hate_crimes_per_100k_splc 1 11.909 11.909 -41.5
##
## Step: AIC=-3365.39
## log(hate_crimes_per_100k_splc) ~ unemployment + urbanization +
## median_household_income + perc_population_with_high_school_degree +
## perc_non_citizen + gini_index + ln_hate_crimes_per_100k_splc
##
## Df Sum of Sq RSS AIC
## - median_household_income 1 0.000 0.000 -3426.7
## <none> 0.000 -3365.4
## - urbanization 1 0.000 0.000 -3320.4
## - perc_non_citizen 1 0.000 0.000 -3300.7
## - unemployment 1 0.000 0.000 -3268.2
## - perc_population_with_high_school_degree 1 0.000 0.000 -3215.4
## - gini_index 1 0.000 0.000 -3183.2
## - ln_hate_crimes_per_100k_splc 1 11.983 11.983 -43.2
##
## Step: AIC=-3426.7
## log(hate_crimes_per_100k_splc) ~ unemployment + urbanization +
## perc_population_with_high_school_degree + perc_non_citizen +
## gini_index + ln_hate_crimes_per_100k_splc
##
## Df Sum of Sq RSS AIC
## <none> 0.000 -3426.7
## - gini_index 1 0.000 0.000 -3309.9
## - perc_population_with_high_school_degree 1 0.000 0.000 -3271.3
## - urbanization 1 0.000 0.000 -3188.3
## - perc_non_citizen 1 0.000 0.000 -3169.9
## - unemployment 1 0.000 0.000 -3125.0
## - ln_hate_crimes_per_100k_splc 1 12.048 12.048 -45.0
##
## Call:
## lm(formula = log(hate_crimes_per_100k_splc) ~ unemployment +
## urbanization + perc_population_with_high_school_degree +
## perc_non_citizen + gini_index + ln_hate_crimes_per_100k_splc,
## data = hate_crime_no_dc)
##
## Coefficients:
## (Intercept)
## 0
## unemploymentlow
## 0
## urbanizationhigh
## 0
## perc_population_with_high_school_degree
## 0
## perc_non_citizen
## 0
## gini_index
## 0
## ln_hate_crimes_per_100k_splc
## 1
stepwise_log_fit_no_dc =
lm(log(hate_crimes_per_100k_splc) ~ perc_population_with_high_school_degree + gini_index,
data = hate_crime_no_dc)
par(mfrow = c(2, 2))
plot(stepwise_log_fit_no_dc)